This notebook outlines the process used and choices made to create a bivariate choropleth map showing median household income and median abortion rate per Indiana county from 2017 to 2021.

A choropleth map uses a range of color to visualize the range of values within the same variable. In contrast, a bivariate map uses different colors to show the relationship between two variables.

To create a bivariate map with nine classes, each variable must be sorted into three categories: low, medium and high. Where the dividing lines for these rankings are drawn impacts how the data are interpreted visually.

library(classInt)
library(cowplot)
library(DT)
library(tidyverse)
library(tidycensus)
library(ggplot2)
library(ggtext)
library(readr)
library(mapview)
library(sf)
library(tigris)



The data

The abortion data is loaded from per_cap.csv that was created from this script used to calculate the abortion rate for each county in Indiana from 2014 to 2021. For this map, it will be filtered to 2017 to 2021 to align with the years the income data was collected.

per_cap <- read_csv("Exported_Data/per_cap.csv")

# filter years to 2017:2021

per_cap <- per_cap %>% filter(year %in% 2017:2021) %>% select (-geometry)


The median household income data comes from the five year American Community Survey.


Abortion data

How are the abortion data distributed? The histogram provides a count of how many times a given rate appears in the data set. There are 92 counties being counted over five years for a total of 460 different rates.

ggplot(per_cap, aes(rate))+
  geom_histogram()


The data skew to the right with clear outliers. The data will be divided both ways - with and without the outliers - to better see the outliers’ impact on the location of our low, medium and high dividing lines.

To remove outlier rates, multiply the interquartile range - the distance between the 25th and 75th percentile - by 1.5 and filter rates that fall that amount below the first and above the third quartile.

# measurements for entire data set

ab_st_d <- sd(per_cap$rate)  # standard deviation
ab_mn <- mean(per_cap$rate) # mean
ab_median <- median(per_cap$rate) # median
ab_iqr <- IQR(per_cap$rate) # # interquartile range (distance between 25 and 75 percentiles)
pc_quant <- quantile(per_cap$rate) # quantiles 

# create per_cap2 dataframe for data with outliers removed

per_cap2 <-  per_cap %>% 
  filter(rate >= quantile(per_cap$rate)[2] - (ab_iqr*1.5)) %>% 
  filter(rate <= quantile(per_cap$rate)[3] + (ab_iqr*1.5))

# filtered measurements denoted by suffix _2

# assign measurements of location to objects
ab_st_d_2 <- sd(per_cap2$rate)  # filtered standard deviation
ab_median_2 <- median(per_cap2$rate) # filtered median
ab_mn_2 <- mean(per_cap2$rate) # filtered mean
ab_iqr_2 <- IQR(per_cap2$rate) # filtered IQR


Increasing the bin size of the histogram will provide more detail about the frequency of rates. A solid blue lines indicates the unfiltered mean, the dashed blue line represents the mean calculated after removing outliers. The yellow line is the median. Data within the shaded gray represent outliers.

ggplot(per_cap, aes(rate))+
  geom_histogram(bins=92)+
  geom_vline(xintercept = ab_mn, color = "blue")+ # mean
  geom_vline(xintercept = ab_median, color = "yellow")+ # median
  geom_vline(xintercept = ab_mn_2, color = "blue", linetype = "dashed")+ # mean
  geom_vline(xintercept = ab_median_2, color = "yellow", linetype="dashed")+
  annotate("rect", xmin=0, xmax=quantile(per_cap$rate)[2] - (ab_iqr*1.5),
           ymin = 0, ymax = 25, alpha = .15, fill = "gray")+
  annotate("rect", xmin=quantile(per_cap$rate)[3] + (ab_iqr*1.5), 
           xmax=max(per_cap$rate),
           ymin = 0, ymax = 25, alpha = .15, fill = "gray")+
  labs(subtitle = "Outliers in shaded gray area")+
  xlim(0, max(per_cap$rate))+
  theme_classic()


Comparing the abortion data measurements with and without outliers removed:

Category Original
data
1.5 *Outliers
removed
Number of counties 460 450
Mean 2.48 2.35
Standard Deviation 1.53 1.22
Interquartile Range 1.9 1.8
Range 0, 12.1 0, 5.1



A handful of counties are responsible for the right skew of the data and have an outsize impact on the range.



Classifying abortion data into low, medium and high categories

Divide abortion rate using standard deviation

Using standard deviation to divide the abortion rate into three classes would locate all values falling one standard deviation below the mean into the low category, rates within one standard devation of the mean in the medium category, and rates one standard deviation above the mean in the high category.

ggplot(per_cap, aes(rate))+
  geom_histogram(bins=92)+
  geom_vline(xintercept = ab_mn, color = "blue")+ # mean
  geom_vline(xintercept = ab_median, color = "yellow")+ # median
  annotate("rect", xmin=min(per_cap$rate), xmax=ab_mn-ab_st_d,
           ymin = 0, ymax = 25, alpha = .15, fill = "green")+
  annotate("rect", xmin=ab_mn-ab_st_d, xmax=ab_st_d+ab_mn,
           ymin = 0, ymax = 25, alpha = .15, fill = "blue")+
  annotate("rect", xmin=ab_mn+ab_st_d, xmax=max(per_cap$rate),
           ymin = 0, ymax = 25, alpha = .15, fill = "red")+
  labs(subtitle = "raw data")+
  theme_classic()

ggplot(per_cap, aes(rate))+
  geom_histogram(bins=92)+
  geom_vline(xintercept = ab_mn, color = "blue")+ # mean
  geom_vline(xintercept = ab_mn_2, color = "blue", linetype = "dashed")+
  geom_vline(xintercept = ab_median, color = "yellow")+ # median
  annotate("rect", xmin=min(per_cap$rate), xmax=ab_mn_2-ab_st_d_2,
           ymin = 0, ymax = 25, alpha = .15, fill = "green")+
  annotate("rect", xmin=ab_mn_2-ab_st_d_2, xmax=ab_st_d_2+ab_mn_2,
           ymin = 0, ymax = 25, alpha = .15, fill = "blue")+
  annotate("rect", xmin=ab_mn_2+ab_st_d_2, xmax=max(per_cap$rate),
           ymin = 0, ymax = 25, alpha = .15, fill = "red")+
  labs(subtitle = "filtered data")+
  theme_classic()



Count of counties with different abortion rate categories
Method Low Counties Medium Counties High Counties
Standard deviation 66 336 58
Standard deviation with outliers removed 86 281 93

Divide abortion rate into thirds

An alternative approach to using standard deviation to categorize low, medium and high rates would be to divide the rate into thirds.

# determine range of rate
og_inc <- range(per_cap$rate)[2]-range(per_cap$rate)[1]

# assign lower third rate and middle third rate to object
lw_trd <- og_inc * .333                            
md_trd <- og_inc * (2*.333 )

# repeat for data with 1.5*IQR outliers removed

f_inc <- range(per_cap2$rate)[2]-range(per_cap2$rate)[1]

f_lw_trd <-  f_inc *.333
f_md_trd <- f_inc * (2*.333)



Count of counties with different abortion rate categories
Method Low counties Medium counties High counties
Standard deviation 66 336 58
Standard deviation with outliers removed 86 281 93
Thirds 402 53 5
Thirds with outliers removed 145 205 110

Dividing the abortion rate using Jenks

Another way to divide the data is to use the jenks natural breaks method.

# load function for jenks breaks
# from: http://cainarchaeology.weebly.com/r-function-for-plotting-jenks-natural-breaks-classification.html

plotJenks <- function(data, n=3, brks.cex=0.70, top.margin=10, dist=5){
  df <- data.frame(sorted.values=sort(data, decreasing=TRUE))
  Jclassif <- classIntervals(df$sorted.values, n, style = "jenks") #requires the 'classInt' package
  test <- jenks.tests(Jclassif) #requires the 'classInt' package
  df$class <- cut(df$sorted.values, unique(Jclassif$brks), labels=FALSE, include.lowest=TRUE) #the function unique() is used to remove non-unique breaks, should the latter be produced. This is done because the cut() function cannot break the values into classes if non-unique breaks are provided
  if(length(Jclassif$brks)!=length(unique(Jclassif$brks))){
    info <- ("The method has produced non-unique breaks, which have been removed. Please, check '...$classif$brks'")
  } else {info <- ("The method did not produce non-unique breaks.")}
  loop.res <- numeric(nrow(df))
  i <- 1
  repeat{
    i <- i+1
    loop.class <- classIntervals(df$sorted.values, i, style = "jenks")
    loop.test <- jenks.tests(loop.class)
    loop.res[i] <- loop.test[[2]]
    if(loop.res[i]>0.9999){
      break
    }
  }
  max.GoF.brks <- which.max(loop.res)
  plot(x=df$sorted.values, y=c(1:nrow(df)), type="b", main=paste0("Jenks natural breaks optimization; number of classes: ", n), sub=paste0("Goodness of Fit: ", round(test[[2]],4), ". Max GoF (", round(max(loop.res),4), ") with classes:", max.GoF.brks), ylim =c(0, nrow(df)+top.margin), cex=0.75, cex.main=0.95, cex.sub=0.7, ylab="observation index", xlab="value (increasing order)")
  abline(v=Jclassif$brks, lty=3, col="red")
  text(x=Jclassif$brks, y= max(nrow(df)) + dist, labels=sort(round(Jclassif$brks, 2)), cex=brks.cex, srt=90)
  results <- list("info"=info, "classif" = Jclassif, "breaks.max.GoF"=max.GoF.brks, "class.data" = df)
  return(results)
}

The distribution with jenks natural breaks.



Count and percent of counties with different abortion rate categories
Method Low counties Medium counties High counties
Standard deviation 66 / 14% 336 / 73% 58 / 13%
Standard deviation with outliers removed 86 / 19% 281 / 61% 93 / 20%
Thirds 402 / 87% 53 / 12% 5 / 1%
Thirds with outliers removed 145 / 32% 205 / 45% 110 / 24%
Jenks natural breaks 234 / 51% 220 / 48% 6 / 1%
Jenks natural breaks with outliers removed 145 / 32% 180 / 39% 135 / 29%

Visualizing the impact of different abortion rate categorizations

#Add the geometry necessary for creating a map and then add the rankings to the data.

options(tigris_use_cache=TRUE)  # cache data

# import geometry for counties and GEOID
counties <- counties("IN", cb = T) %>% select(GEOID, geometry)

# make GEOID character in per_cap

per_cap$GEOID <- as.character(per_cap$GEOID)
# join county geometry with totals and population
per_cap <- left_join(per_cap, counties)


per_cap <- st_sf(per_cap)
# Add rankings for each categorization method used above.

per_cap <- per_cap %>% 
  mutate(ab_sd_rank = 
           case_when(
             annual_co_med < ab_mn-ab_st_d ~ "la",
             between(annual_co_med, ab_mn-ab_st_d, ab_mn+ab_st_d) ~ "ma",
             annual_co_med > ab_mn+ab_st_d ~ "ha"),
         ab_f_sd_rank = 
          case_when(
            annual_co_med <= ab_mn_2-ab_st_d_2 ~ "la",
            between(annual_co_med, ab_mn_2-ab_st_d_2, ab_mn_2+ab_st_d_2) ~ "ma",
            annual_co_med >= ab_mn_2+ab_st_d_2 ~ "ha"),
         ab_trd_rank = 
          case_when(
            annual_co_med < lw_trd ~ "la",
            between(annual_co_med, lw_trd, md_trd) ~ "ma",
            annual_co_med > md_trd ~ "ha"),
        ab_f_trd_rank = 
          case_when(
            annual_co_med < f_lw_trd ~ "la",
            between(annual_co_med, f_lw_trd, f_md_trd) ~ "ma",
            annual_co_med > f_md_trd ~ "ha"),
        jnks =
          case_when(
            annual_co_med < jnks_lw ~ "la",
            between(annual_co_med, jnks_lw, jnks_md) ~ "ma",
            annual_co_med > jnks_md ~ "ha"),
        f_jnks =
          case_when(
            annual_co_med < f_jnks_lw ~ "la",
            between(annual_co_med, f_jnks_lw, f_jnks_md) ~ "ma",
            annual_co_med > f_jnks_md ~ "ha")
          ) 


Dividing the abortion rate using standard deviation.



Dividing the abortion rate using standard deviation with outliers removed.


Dividing the abortion rate into thirds.


Dividing the abortion rate into thirds with outliers removed.



Dividing the abortion rate using jenks natural breaks.



Dividing the abortion rate using jenks natural breaks with outliers removed.



#### Comparing maps with abortion rates categories calculated using different methods



The method to divide the rate into three parts that seems to best match the abortion rate data is to use standard deviation after removing outliers calculated using the 1.5 times the interquartile range method.


Median income data

# load median income data
cache = TRUE

in_med <- 
  get_acs(
    geography = "county",
    year = 2021,
    state = "IN",
    variables = "B19013_001",output = "tidy",geometry = TRUE)

How are the median income data distributed? There are 92 counties represented in this data set.

ggplot(in_med, aes(estimate))+
  geom_histogram()

The data skew right with outliers on the right. The data will be divided with and without outliers to better see the outliers’ impact on the location of our low, medium, and high dividing lines.

To remove outlier rates, multipy the interquartile range (the distance between the 25th and 75th percentile) by 1.5 and filter rates that fall that amount below the first and that amount above the third quartile.

Increasing the bin size of the histogram will provide more detail about the frequency of rates. A solid blue lines represents the unfiltered mean, a dashed blue line represents the mean calculated after removing outliers. The yellow line is the median. Data within the shaded gray represent outliers.

ggplot(in_med, aes(estimate))+
  geom_histogram(bins=92)+
  geom_vline(xintercept = med_mn, color = "blue")+ # mean
  geom_vline(xintercept = med_med, color = "yellow")+ # median
  geom_vline(xintercept = f_med_mn, color = "blue", linetype = "dashed")+ # mean
  geom_vline(xintercept = med_med, color = "yellow", linetype="dashed")+
  annotate("rect", xmin=min(in_med$estimate), xmax=med_qnt[2] - (med_iqr*1.5),
           ymin = 0, ymax = 10, alpha = .15, fill = "gray")+
  annotate("rect", xmin=med_qnt[3] + (med_iqr*1.5), 
           xmax=max(in_med$estimate),
           ymin = 0, ymax = 10, alpha = .15, fill = "gray")+
  labs(subtitle = "Outliers in shaded gray area")+
  theme_classic()

Comparing the income data with and without outliers removed:

Category Original
data
1.5 *Outliers
removed
Number of counties 92 79
Mean 60869 57747
Standard Deviation 10235 5890
Interquartile Range 10085 8186
Range 42465, 104858 42465, 69440

Classifying income data into low, medium and high categories


#### Using standard deviation to divide the income range

Using standard deviation to divide the abortion rate into three classes would place all values located one standard deviation below the mean in the low category, rates within one standard deviation of the mean in the medium category and rates one standard deviation above the mean in the high category.

ggplot(in_med, aes(estimate))+
  geom_histogram(bins=92)+
  geom_vline(xintercept = med_mn, color = "blue")+ # mean
  geom_vline(xintercept = med_med, color = "yellow")+ # median
  annotate("rect", xmin=min(in_med$estimate), xmax=med_mn-med_sd,
           ymin = 0, ymax = 10, alpha = .2, fill = "green")+
  annotate("rect", xmin=med_mn-med_sd, xmax=med_mn+med_sd,
           ymin = 0, ymax = 10, alpha = .2, fill = "blue")+
  annotate("rect", xmin=med_mn+med_sd, xmax=max(in_med$estimate),
           ymin = 0, ymax = 10, alpha = .2, fill = "red")+
  labs(subtitle = "standard deviation")+
  theme_classic()

ggplot(in_med, aes(estimate))+
  geom_histogram(bins=92)+
  geom_vline(xintercept = f_med_mn, color = "blue")+ # mean
  geom_vline(xintercept = f_med_med, color = "yellow")+ # median
  annotate("rect", xmin=min(in_med$estimate), xmax=f_med_mn-f_med_sd,
           ymin = 0, ymax = 10, alpha = .2, fill = "green")+
  annotate("rect", xmin=f_med_mn-f_med_sd, xmax=f_med_mn+f_med_sd,
           ymin = 0, ymax = 10, alpha = .2, fill = "blue")+
  annotate("rect", xmin=f_med_mn+f_med_sd, xmax=max(in_med$estimate),
           ymin = 0, ymax = 10, alpha = .2, fill = "red")+
  labs(subtitle = "standard deviation, outliers removed")+
  theme_classic()

Count of counties by different income rate category classification methods
Method Low Counties Medium Counties High Counties
Standard deviation 10 71 11
Standard deviation with outliers removed 12 55 25

Dividing the income range into equal thirds


An alternative approach to using standard deviation to categorize low, medium and high rates would be to divide the range of the rate - the amount between the minimum and maximum values - into thirds.

# determine range of rate
med_og_inc <- range(in_med2$estimate)[2]-range(in_med2$estimate)[1]

# assign lower third line and middle third line to object
med_lw_trd <- med_og_inc * .333                            
med_md_trd <- med_og_inc * (2*.333 )

# repeat for data with 1.5*IQR outliers removed

f_inc <- range(in_med3$estimate)[2]-range(in_med3$estimate)[1]

f_med_lw_trd <-  f_inc *.333
f_med_md_trd <- f_inc * (2*.333)

Count of counties by different income rate category classification methods
Method Low Counties Medium Counties High Counties
Standard deviation 10 71 11
Standard deviation with outliers removed 12 55 25
Thirds 63 25 4
Thirds with outliers removed 10 43 39

Dividing the income range using Jenks

Another way to divde the data is use the jenks natural breaks method.

# load function for jenks breaks
# from: http://cainarchaeology.weebly.com/r-function-for-plotting-jenks-natural-breaks-classification.html

plotJenks <- function(data, n=3, brks.cex=0.70, top.margin=10, dist=5){
  df <- data.frame(sorted.values=sort(data, decreasing=TRUE))
  Jclassif <- classIntervals(df$sorted.values, n, style = "jenks") #requires the 'classInt' package
  test <- jenks.tests(Jclassif) #requires the 'classInt' package
  df$class <- cut(df$sorted.values, unique(Jclassif$brks), labels=FALSE, include.lowest=TRUE) #the function unique() is used to remove non-unique breaks, should the latter be produced. This is done because the cut() function cannot break the values into classes if non-unique breaks are provided
  if(length(Jclassif$brks)!=length(unique(Jclassif$brks))){
    info <- ("The method has produced non-unique breaks, which have been removed. Please, check '...$classif$brks'")
  } else {info <- ("The method did not produce non-unique breaks.")}
  loop.res <- numeric(nrow(df))
  i <- 1
  repeat{
    i <- i+1
    loop.class <- classIntervals(df$sorted.values, i, style = "jenks")
    loop.test <- jenks.tests(loop.class)
    loop.res[i] <- loop.test[[2]]
    if(loop.res[i]>0.9999){
      break
    }
  }
  max.GoF.brks <- which.max(loop.res)
  plot(x=df$sorted.values, y=c(1:nrow(df)), type="b", main=paste0("Jenks natural breaks optimization; number of classes: ", n), sub=paste0("Goodness of Fit: ", round(test[[2]],4), ". Max GoF (", round(max(loop.res),4), ") with classes:", max.GoF.brks), ylim =c(0, nrow(df)+top.margin), cex=0.75, cex.main=0.95, cex.sub=0.7, ylab="observation index", xlab="value (increasing order)")
  abline(v=Jclassif$brks, lty=3, col="red")
  text(x=Jclassif$brks, y= max(nrow(df)) + dist, labels=sort(round(Jclassif$brks, 2)), cex=brks.cex, srt=90)
  results <- list("info"=info, "classif" = Jclassif, "breaks.max.GoF"=max.GoF.brks, "class.data" = df)
  return(results)
}

Count and percent of counties by different income rate category classification methods
Method Low Counties Medium Counties High Counties
Standard deviation 10 / 11% 71 / 77% 11 / 12%
Standard deviation with outliers removed 12 / 13% 55 / 60% 25 / 27%
Thirds 63 / 68% 25 / 27% 4 / 4%
Thirds with outliers removed 10 / 11% 43 / 47% 39 / 42%
Jenks 43 / 47% 41 / 45% 8 / 9%
Jenks with outliers removed 12 / 13% 39 / 42% 28 / 30%

Visualizing the impact of different income categorizations

Visualizing how the map is impacted by the method used to divide the income rate.

# Because the median income data was imported from tidycensus the necessary geometry data for mapping is present, but the rankings still need to be added.

in_med <- in_med %>% 
  mutate(sd_rank = 
           case_when(
             estimate < med_mn-med_sd ~ "li",
             between(estimate, med_mn-med_sd, med_mn+med_sd) ~ "mi",
             estimate > med_mn+med_sd ~ "hi"),
         f_sd_rank = 
           case_when(
          estimate < f_med_mn-f_med_sd ~ "li",
             between(estimate, f_med_mn-f_med_sd, f_med_mn+f_med_sd) ~ "mi",
             estimate > f_med_mn+f_med_sd ~ "hi"),
         trd_rank = 
          case_when(
            estimate < min(in_med$estimate) + med_lw_trd ~ "li",
            between(estimate, min(in_med$estimate) + med_lw_trd, min(in_med$estimate) + 2*med_lw_trd) ~ "mi",
            estimate > min(in_med$estimate) + 2*med_lw_trd ~ "hi"),
        f_trd_rank = 
          case_when(
            estimate < min(in_med$estimate) + f_med_lw_trd ~ "li",
            between(estimate, min(in_med$estimate) + f_med_lw_trd, min(in_med$estimate) + 2*f_med_lw_trd) ~ "mi",
            estimate > min(in_med$estimate) + 2*f_med_lw_trd ~ "hi"),
        in_jnks = 
          case_when(
            estimate < inc_jnks_lw ~ "li",
            between(estimate, inc_jnks_lw, inc_jnks_md) ~ "mi",
            estimate > inc_jnks_md ~ "hi"
          ),
        f_in_jnks = 
          case_when(
            estimate < f_inc_jnks_lw ~ "li",
            between(estimate, f_inc_jnks_lw, f_inc_jnks_md) ~ "mi",
            estimate > f_inc_jnks_md ~ "hi"
          ))


Dividing the income rate using standard deviation.



Dividing the income rate using standard deviation with outliers removed.



Dividing the income rate into thirds.



Dividing the income rate into thirds with outliers removed.



Dividing the rate into thirds using jenks natural breaks.

Dividing the rate into thirds using jenks natural breaks with outliers removed.


Comparing the maps of income rates side by side


Raw data

Data with outliers removed


Dividing the income rate using Jenks natural breaks yields the best results.


The bivariate map

The data for abortion rate and the data for median income need to be combined.

inc_ab <- per_cap %>% 
  left_join(in_med, per_cap, by = "GEOID") %>% 
    select(-c(variable, NAME))

# add column for joint ranking

inc_ab <- inc_ab %>% mutate(rank = paste0(ab_f_sd_rank, in_jnks),
                            .after = Name)

Prepare a color scheme: Josh Stevens posted a helpful explanation about creating a bivariate map color scheme. This map uses one of his suggested palettes. Similarly, Timo Grossenbacher’s post about creating a bivariate map in R influenced the process below.

To keep things straight it helps to visualize 3 x 3 grid with abortion increasing from left to right in three stages and income increasing from bottom to top. Each square gets a code to make it easier to keep things straight.

  # a3 b3 c3
  # a2 b2 c2
  # a1 b1 c1

Each location on the grid will be assigned a value and a corresponding color. The values for low, medium and high were assigned for each data set during the steps above but they’ll need to be combined into one code and then assigned a color.

First, assign the colors.

# colors 
col <- c(
  "#e8e8e8",  # a1  low abortion - low income       a1: la-li           
  "#b5c0da",  # b1 medium abortion - low income     b1: ma-li     
  "#6c83b5",  # c1 high abortion - low income       c1: ha-li    
  "#b8d6be",  # a2 low abortion - medium income     a2: la-mi
  "#90b2b3",  # b2 medium abortion - medium income  b2: ma-mi
  "#567994",  # c2 high abortion - medium income    c2: ha-mi
  "#73ae80",  # a3 low abortion - high income       a3: la-hi
  "#5a9178",  # b3 medium abortion - high income    b3: ma-hi
  "#2a5a5b")  # c3 high abortion - high income      c3: ha-hi

# positions of matrix
col_pos <- c(
  "a1",
  "b1",
  "c1",
  "a2",
  "b2",
  "c2",
  "a3",
  "b3",
  "c3")

# a description for easy reference 

desc <- c("low abortion - low income",        #a1
          "medium abortion - low income",     #b1
          "high abortion - low income",       #c1
          "low abortion - medium income",     #a2
          "medium abortion - medium income",  #b2
          "high abortion - medium income",    #c2
          "low abortion - high income",       #a3
          "medium abortion - high income",    #b3
          "high abortion - high income")      #c3

# a two category code to assign to categories (abortion rate and median income)

rank <- c("lali", "mali", "hali", "lami", "mami", "hami", "lahi", "mahi",
          "hahi")

# combine into tibble

biv_col <- tibble(pos = col_pos, color = col, rank = rank, desc=desc)

Join the dataframe with the color and rank categories to the dataframe with the ranked abortion rate and median income data.

# join biv_color to inc_ab dataframe ####
# using standard deviation 1.5*IQR abortion rate

biv <- left_join(inc_ab, biv_col, by = "rank")

sf_biv <- st_sf(biv)

Create the legend.

# create legend: ####
legend <- 
  
biv_col %>% mutate(
  x = str_sub(pos, start = 1L, end = 1L),
  y = str_sub(pos, start = 2L, end = 2L)
) %>% 
ggplot()+
  geom_tile(
    aes(x, y,
      fill = color)
  ) +
  scale_fill_identity() +
  labs(
    x = paste0("Increasing abortion  ", "\U2192"),
    y = paste0("Increasing income    ", "\U2192")
  )+
  theme_void()+
  theme(
    axis.title.x = element_text(size = 8,
                                hjust = .8),
    axis.title.y = element_text(size = 8,
                                vjust = 0,
                                hjust = 1,
                                angle = 90),
    axis.text = element_blank()
  )+
  coord_fixed(ratio = 1:1)

Create a the map and add the legend.

# unclear why labels aren't showing up in rendered html; they'll remain for now.

# calculate center of one county for each rate/income rating matrix corner combination and assign x, y ####


# low abortion, low income county center point
lali_county <-  sf_biv %>% filter(year == 2021) %>% 
  select(Name, GEOID, rank, geometry) %>% 
  filter(rank == "la-li") %>% 
  slice(2) %>%  # can adjust slice to select needed la-li county 
  st_centroid(geometry)

lali_county <- do.call(rbind, st_geometry(lali_county))

lali_county_x <- lali_county[1]
lali_county_y <- lali_county[2]

# high abortion, high income county center point
hahi_county <-  sf_biv %>% filter(year == 2021) %>% 
  select(Name, GEOID, rank, geometry) %>% 
  filter(rank == "ha-hi") %>% 
  slice(2) %>%  # can adjust slice to select needed la-li county
  st_centroid(geometry)

hahi_county <- do.call(rbind, st_geometry(hahi_county))

hahi_county_x <- hahi_county[1]
hahi_county_y <- hahi_county[2]

# high abortion, low income county center point
hali_county <-  sf_biv %>% filter(year == 2021) %>% 
  select(Name, GEOID, rank, geometry) %>% 
  filter(rank == "ha-li") %>% 
  slice(1) %>%  # can adjust slice to select needed la-li county
  st_centroid(geometry)

hali_county <- do.call(rbind, st_geometry(hali_county))

hali_county_x <- hali_county[1]
hali_county_y <- hali_county[2]

# low abortion, high income county center point
lahi_county <-  sf_biv %>% filter(year == 2021) %>% 
  select(Name, GEOID, rank, geometry) %>% 
  filter(rank == "la-hi") %>% 
  slice(1) %>%  # can adjust slice to select needed la-li county
  st_centroid(geometry)

lahi_county <- do.call(rbind, st_geometry(lahi_county))

lahi_county_x <- lahi_county[1]
lahi_county_y <- lahi_county[2]

# add panel color 
panel_c <- "#fdfdf2"

# create plot 

p <- 
  ggplot()+
  geom_sf(data = sf_biv %>% filter(year == 2021),
          aes(fill=color),
          color = "white",
          size = 0.1,
          alpha = .95,
          show.legend = F)+
  scale_fill_identity()+
  # expand area around map to accomodate text labels
  coord_sf(xlim = c(-89, -84),
           ylim = c(37.75, 41.65),
           expand = TRUE)+
  theme_void()+
  theme(
    plot.margin = unit(c(.25, 1, 3, 1), "cm")
  )+
  #  main labels ####
labs(
  title = title,
  subtitle = s_title,
  caption = caption
)+
  # theme adjustments ####
theme(
  axis.text = element_blank(),
  axis.ticks = element_blank(),
  axis.line = element_blank(),
  panel.background = element_rect(color = NA,
                                  fill = panel_c),
  plot.background  = element_rect(color = NA,
                                  fill = panel_c),
  plot.title = element_textbox_simple(
    size = 22, lineheight = 1, family = "serif", padding = margin(.5, 5, 3, 1)),
  plot.subtitle = element_textbox_simple(
    size = 14, lineheight = 1, family = "sans", padding = margin(2, 0, 1, 0)),
  plot.caption = element_textbox_simple(
    size = 10, lineheight = 1.2, family = "sans", padding = margin(.5, 0, 1, 0),
    halign = 1),
  plot.caption.position = "plot"
)+
  # map annotations #### 
# low abortion low income county curve
geom_curve( 
  aes(
    x = lali_county_x/1.009,
    xend = lali_county_x,
    y = lali_county_y*1.004,
    yend = lali_county_y),
  curvature = -.2,
  linewidth = .75,
  color = "dark gray",
  arrow = arrow(
    length = unit(.2, "cm")))+  
  # low abortion low income county text
  annotate("text", lali_county_x/1.005, lali_county_y*1.008,
           colour = "black",
           size = 3.5,
           label = "Light gray counties\nmean low abortion\nand low income",
           lineheight = .9,
           hjust=0
  )+
  # high abortion high income county curve
  geom_curve( 
    aes(
      x = hahi_county_x*1.009,
      xend = hahi_county_x,
      y = hahi_county_y/1.003,
      yend = hahi_county_y),
    curvature = .2,
    linewidth = .75,
    color = "dark gray",
    arrow = arrow(
      length = unit(.2, "cm")))+
  # high abortion high income county text
  annotate("text", hahi_county_x*1.019, hahi_county_y/1.001,
           colour = "black",
           size = 3.5,
           label = "Dark bluegreen counties\nmean high abortion\nand high Income",
           lineheight = .9,
           hjust = 0
  )+
  # high abortion, low income county curve
  geom_curve( 
    aes(
      x = hali_county_x*1.009,
      xend = hali_county_x,
      y = hali_county_y/1.004,
      yend = hali_county_y),
    curvature = -.2,
    linewidth = .75,
    color = "dark gray",
    arrow = arrow(
      length = unit(.2, "cm")))+
  # high abortion, low income county text
  annotate("text", hali_county_x*1.0137, hali_county_y/1.0075,
           colour = "black",
           size = 3.5,
           label = "Blue counties\nmean high abortion\nand low income",
           lineheight = .9,
           hjust = 0
  )+
  # low abortion, high income county curve
  geom_curve( 
    aes(
      x = lahi_county_x/1.009,
      xend = lahi_county_x/1.0025,
      y = lahi_county_y*1.004,
      yend = lahi_county_y),
    curvature = -.2,
    linewidth = .75,
    color = "dark gray",
    arrow = arrow(
      length = unit(.2, "cm")))+
  # low abortion, high income county curve  
  annotate("text", lahi_county_x/1.004, lahi_county_y*1.0085,
           colour = "black",
           size = 3.5,
           label = "Green counties\nmean low abortion\nand high income",
           lineheight = .9,
           hjust = 0
  )

ggdraw() +
  draw_plot(p, 0, -.05, 1, 1) +
  draw_plot(legend, x=0.77, y=0.14, 0.13, 0.13, scale = 1.2)+
  theme(plot.background = element_rect(fill = panel_c, color = NA))



Explore the data